# Load necessary packages
import os
import numpy as np
import pandas as pd
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from kneed import KneeLocator
import matplotlib.pyplot as plt
import gensim.downloader as api
import re
from sklearn.decomposition import PCA
import random

# Set Seed for Reproducibility
# Seed both Python's and NumPy's global generators: scikit-learn estimators
# that are not given an explicit random_state draw from NumPy's global RNG,
# which random.seed() alone does not control.
# NOTE(review): gensim documents that exact run-to-run reproducibility also
# needs a single worker thread and a fixed PYTHONHASHSEED environment variable.
random.seed(1996)
np.random.seed(1996)
#############################
# Load Data
# Corpus table read from the working directory; later steps use its
# 'issue_final' and 'candidate_id' columns.
df = pd.read_csv(filepath_or_buffer='model_text.csv')
# Pretrained word2vec embeddings, fetched through gensim's downloader
wv = api.load(name='word2vec-google-news-300')
### Estimation for Model in Paper ###
# Pull out the issue statements and the candidate id attached to each row
df_text = df['issue_final']
tag = df['candidate_id']

# Build one TaggedDocument per row: the statement is coerced to str and
# tokenised, and the stringified candidate id is its only tag, so every
# statement by the same candidate shares a single document vector.
tagged_data = [
    TaggedDocument(words=word_tokenize(str(statement)), tags=[str(candidate)])
    for statement, candidate in zip(df_text, tag)
]

### Estimation for Various Model Parameters ###
# Grid of context-window sizes and hidden-layer (embedding) dimensions.
window_sizes = [5, 6, 7]
hidden_dimensions = [100, 200, 300]

# Unique candidate ids in order of first appearance. This does not depend on
# the model settings, so it is computed once instead of inside the loop.
candidates = tag.drop_duplicates().reset_index(drop=True)
# Document tags were stored as str(candidate_id) when the corpus was built,
# so the same string form is used to look the vectors up again.
candidate_tags = [str(candidate) for candidate in candidates]

# Results table: one row per candidate, one column per (window, dimension)
# setting. The id column is filled up front; starting from an empty frame
# left 'candidate_id' as all-NaN in the saved CSV.
results_df = pd.DataFrame({'candidate_id': candidates})

# Loop over the window sizes and hidden dimensions to estimate and save models
for window_size in window_sizes:
    for hidden_dimension in hidden_dimensions:
        # Build the Doc2Vec model (DBOW with simultaneous word training)
        model = Doc2Vec(vector_size=hidden_dimension, window=window_size,
                        dm=0, dbow_words=1, min_count=5,
                        workers=1, epochs=20, seed=1996)
        model.build_vocab(tagged_data)

        # Initialise word vectors from the pretrained embeddings, word by
        # word. Replacing model.wv.vectors wholesale with wv.vectors is wrong:
        # the pretrained rows are not aligned with this model's vocabulary
        # indices, the widths differ whenever vector_size is not the
        # pretrained size, and training would then update the shared
        # pretrained array in place, leaking into later grid cells.
        # When the dimensions differ the pretrained vectors cannot be used
        # as-is, so the model keeps gensim's own initialisation.
        if model.vector_size == wv.vector_size:
            for word, row in model.wv.key_to_index.items():
                if word in wv.key_to_index:
                    model.wv.vectors[row] = wv[word]

        model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

        # Stack one document vector per candidate, looked up by tag so the
        # row order provably matches `candidates`. float64 matches the
        # dtype of the zero-initialised matrix used previously.
        z = np.asarray([model.dv[doc_tag] for doc_tag in candidate_tags],
                       dtype=np.float64)

        # Apply PCA to reduce each candidate vector to a single dimension.
        # random_state only matters when scikit-learn selects a randomized
        # solver, in which case it keeps the projection reproducible.
        pca_candidate = PCA(n_components=1, random_state=1996)
        dim1 = pca_candidate.fit_transform(z)[:, 0]

        # Add the results to the dataframe
        column_name = f'window_{window_size}_hidden_{hidden_dimension}'
        results_df[column_name] = dim1

# Save the results to a CSV file
results_df.to_csv('ideology_estimates_robust.csv', index=False)



